#install.packages("glmnet")
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-6
Let’s recreate the simulations from lecture. We assume the predictors are normally distributed (with differing means and standard deviations) and the response is assumed \(Y=20 + 3X_1 - 2X_2 + \epsilon\) where epsilon is normal \((\mu=0, \sigma^2=4)\)
set.seed(35521)
x1 <- rnorm(30,0,3)
x2 <- rnorm(30,1,4)
y <- 20 + 3*x1 - 2*x2 + rnorm(length(x1), sd=2)
plot(y~x1+x2)
x <- cbind(x1, x2)
Now we fit a standard linear model to see how close we are to estimating the true model…
linmod <- lm(y~x1+x2)
summary(linmod)
##
## Call:
## lm(formula = y ~ x1 + x2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7524 -0.9086 -0.1260 1.7005 3.0130
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.4425 0.3636 53.47 <2e-16 ***
## x1 3.0808 0.1335 23.07 <2e-16 ***
## x2 -2.1267 0.1097 -19.39 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.941 on 27 degrees of freedom
## Multiple R-squared: 0.9734, Adjusted R-squared: 0.9714
## F-statistic: 494.4 on 2 and 27 DF, p-value: < 2.2e-16
Now ridge regression…
rrsim_d <- cv.glmnet(x, y, alpha=0)
plot(rrsim_d$glmnet.fit, label=TRUE, xvar="lambda")
plot(rrsim_d)
rrsim_d$lambda.min
## [1] 0.8770535
Since the minimum was determined nearly at the lower boundary of \(\lambda\), it’s worthwhile to try a different grid of \(\lambda\) values than what the default provides…
grid <- exp(seq(10, -6, length=100))
set.seed(451642)
rrsim <- cv.glmnet(x, y, alpha=0, lambda=grid)
plot(rrsim$glmnet.fit, label=TRUE, xvar="lambda")
plot(rrsim)
rrsim$lambda.min
## [1] 0.1198862
Sure enough, the minimal predict error arises from a smaller \(\lambda\) than our original grid tested. We can now look at the actual model fits for the suggested values of lambda…
lammin <- rrsim$lambda.min
lam1se <- rrsim$lambda.1se
rrsimmin <- glmnet(x,y,alpha = 0, lambda=lammin)
rrsim1se <- glmnet(x,y,alpha = 0, lambda=lam1se)
coef(rrsimmin)
## 3 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 19.458267
## x1 3.050425
## x2 -2.106344
coef(rrsim1se)
## 3 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 19.581891
## x1 2.811993
## x2 -1.946284
Lets throw those results into a table for easy comparison…
coeftab <- cbind(c(20.00,3.00,-2.00), coef(linmod), coef(rrsimmin), coef(rrsim1se))
colnames(coeftab) <- c("TRUE", "LM", "RidgeRegMin", "RidgeReg1se")
round(coeftab,2)
## 3 x 4 sparse Matrix of class "dgCMatrix"
## TRUE LM RidgeRegMin RidgeReg1se
## (Intercept) 20 19.44 19.46 19.58
## x1 3 3.08 3.05 2.81
## x2 -2 -2.13 -2.11 -1.95
So ridge regression with the \(\lambda\) with estimated minimum MSE from CV provides estimates closer to the true values than the standard linear model approach as well as the ridge regression with \(\lambda\) within 1 standard error of the minimum. We can also run lasso using the glmnet command, the only difference is setting \(\alpha=1\) instead of 0…
lasim <- cv.glmnet(x, y, alpha=1, lambda=grid)
plot(lasim$glmnet.fit, label=TRUE, xvar="lambda")
plot(lasim)
lammin <- lasim$lambda.min
lam1se <- lasim$lambda.1se
lasimmin <- glmnet(x,y,alpha = 1, lambda=lammin)
lasim1se <- glmnet(x,y,alpha = 1, lambda=lam1se)
coef(lasimmin)
## 3 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 19.457600
## x1 3.050666
## x2 -2.101920
coef(lasim1se)
## 3 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 19.565775
## x1 2.834640
## x2 -1.924461
coeftab <- cbind(c(20.00,3.00,-2.00), coef(linmod), coef(rrsimmin), coef(lasimmin))
colnames(coeftab) <- c("TRUE", "LM", "RidgeRegMin", "LassoMin")
round(coeftab,2)
## 3 x 4 sparse Matrix of class "dgCMatrix"
## TRUE LM RidgeRegMin LassoMin
## (Intercept) 20 19.44 19.46 19.46
## x1 3 3.08 3.05 3.05
## x2 -2 -2.13 -2.11 -2.10
Results are essentially the same between ridge regression and lasso on this simulation. This is because to approach the true model, we only need to very slightly shrink the original estimates. Furthermore, there are no useless predictors to remove in this case. Let’s change that…
Let’s set up a model similar to one you would have seen in 570. A bunch of independent and useless predictors!
set.seed(52141)
bmat <- matrix(rnorm(50000), nrow=100)
dim(bmat)
## [1] 100 500
y <- rnorm(100)
bsimcv <- cv.glmnet(bmat, y, alpha=1)
plot(bsimcv)
plot(bsimcv$glmnet.fit, label=TRUE, xvar="lambda")
In this case, the lambda within 1 standard error appears to be at the boundary. So lets again manually specify a grid of lambda to see what happens…
grid <- exp(seq(-6, -1, length=100))
bsimcv2 <- cv.glmnet(bmat, y, alpha=1, lambda=grid)
plot(bsimcv2)
Interesting! The model that removes all predictors (and thereby only models according to the intercept) has a CV estimate of the MSE within 1 standard error of the minimum predicted MSE. This is a telltale sign that there are probably no useful predictors sitting in this data set!
Heres how those 3d plots were generated. Note that it’s a bit hacky, as I’m simply using points rather than a surface. I’ve spent a significant amount of time investigating other surface plots, and then deciding it was all too finicky to pull off and reverting back to this setup!
First I’ll setup the data
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
set.seed(35521)
x1 <- rnorm(30,0,3)
x2 <- x1+rnorm(30,0,1)
y <- 20 + 3*x1 + rnorm(length(x1), sd=1)
x <- cbind(x1, x2)
pairs(cbind(y,x))
now a grid for plotting…
b1b2 <- expand.grid(seq(-4, 4, by=0.1), seq(-4, 4, by=0.1))
b1b2 <- cbind(b1b2, rowSums(abs(b1b2)), rowSums(b1b2^2))
inters <- apply(b1b2, 1, function(v) mean(y)-v[1]*mean(x1)-v[2]*mean(x2))
params <- cbind(inters, b1b2)
rssgrid <- apply(params, 1, function(v) sum((y-(v[1]+v[2]*x1+v[3]*x2))^2))
plotit <- data.frame(b1=b1b2[,1], b2=b1b2[,2], rss=rssgrid)
###essentially unconstrained (penalty weak enough that global minimum exists)
library(plotly)
scene = list(camera = list(eye = list(x = -1.25, y = 1.25, z = 0)))
rr <- plotit[b1b2[,4]<=40,]
la <- plotit[b1b2[,3]<=10,]
Here’s the unconstrained ridge regression
plot_ly(rr, x=~b1, y=~b2, z=~rss, marker = list(color = ~rss, colorscale = c('#FFE1A1', '#683531'), showscale = TRUE)) %>%
add_markers()
and the unconstrained LASSO…
plot_ly(la, x=~b1, y=~b2, z=~rss, marker = list(color = ~rss, colorscale = c('#FFE1A1', '#683531'), showscale = TRUE)) %>%
add_markers()
now a bit more constraining…RR
#slight constraint
rr <- plotit[b1b2[,4]<=30,]
la <- plotit[b1b2[,3]<=7,]
plot_ly(rr, x=~b1, y=~b2, z=~rss, marker = list(color = ~rss, colorscale = c('#FFE1A1', '#683531'), showscale = TRUE)) %>%
add_markers()
and LASSO
plot_ly(la, x=~b1, y=~b2, z=~rss, marker = list(color = ~rss, colorscale = c('#FFE1A1', '#683531'), showscale = TRUE)) %>%
add_markers()
…more constraining..RR
rr <- plotit[b1b2[,4]<=20,]
la <- plotit[b1b2[,3]<=5,]
plot_ly(rr, x=~b1, y=~b2, z=~rss, marker = list(color = ~rss, colorscale = c('#FFE1A1', '#683531'), showscale = TRUE)) %>%
add_markers()
…and LASSO
plot_ly(la, x=~b1, y=~b2, z=~rss, marker = list(color = ~rss, colorscale = c('#FFE1A1', '#683531'), showscale = TRUE)) %>%
add_markers()
heavier constraints..RR
rr <- plotit[b1b2[,4]<=10,]
la <- plotit[b1b2[,3]<=2,]
plot_ly(rr, x=~b1, y=~b2, z=~rss, marker = list(color = ~rss, colorscale = c('#FFE1A1', '#683531'), showscale = TRUE)) %>%
add_markers()
LASSO
plot_ly(la, x=~b1, y=~b2, z=~rss, marker = list(color = ~rss, colorscale = c('#FFE1A1', '#683531'), showscale = TRUE)) %>%
add_markers()
and more…RR
rr <- plotit[b1b2[,4]<=1,]
la <- plotit[b1b2[,3]<=1,]
plot_ly(rr, x=~b1, y=~b2, z=~rss, marker = list(color = ~rss, colorscale = c('#FFE1A1', '#683531'), showscale = TRUE)) %>%
add_markers()
plot_ly(la, x=~b1, y=~b2, z=~rss, marker = list(color = ~rss, colorscale = c('#FFE1A1', '#683531'), showscale = TRUE)) %>%
add_markers()
and finally more again…RR
rr <- plotit[b1b2[,4]<=0.5,]
la <- plotit[b1b2[,3]<=0.5,]
plot_ly(rr, x=~b1, y=~b2, z=~rss, marker = list(color = ~rss, colorscale = c('#FFE1A1', '#683531'), showscale = TRUE)) %>%
add_markers()
and LASSO.
plot_ly(la, x=~b1, y=~b2, z=~rss, marker = list(color = ~rss, colorscale = c('#FFE1A1', '#683531'), showscale = TRUE)) %>%
add_markers()